Lab 8: Spectral Factorizations

Part 1. Learning Spectral Components

In this part we will design a simple component analyzer. Use the sound file [https://drive.google.com/uc?export=download&id=1fyHhUqYVOrIPzSMJoQC78sqeokjqI4ib]. This is a drum loop with four distinct sounds (bass drum, snare drum, cymbal and synthetic bell sound). We will use a spectral factorization that will allow us to extract them all. Obtain the STFT of this signal and use a DFT size of 4096, a hop size of 256 and a Hann window. This will be stored in a matrix $\mathbf F$ whose size will be $M$ by $N$.

You now need to implement a factorization technique. This is defined as:

$$|\mathbf F | \approx \mathbf{W} \cdot \mathbf{H}$$$$\mathbf{F} \in \mathbb{R}^{M\times N}_+, \mathbf{W} \in \mathbb{R}^{M\times K}_+, \mathbf{H} \in \mathbb{R}^{K\times N}_+$$

Where $\mathbb{R}^{A\times B}_+$ is the set of matrices of size $A \times B$ containing non-negative elements, and $|\mathbf{F}|$ takes the absolute value of the STFT matrix $\mathbf{F}$. In this case we will use $K$=4 since the mix we are analyzing has four distinct sounds. To estimate the values of $\mathbf{W}$ and $\mathbf{H}$ start by filling them with uniformly random values between 10 to 11 and iterate over the following equations:

$$\mathbf{V} = \frac{|\mathbf{F}|}{\mathbf{W}\cdot \mathbf{H} + \epsilon}$$$$\mathbf{H} = \mathbf{H} \odot \left[ \mathbf{W}^\top \cdot \mathbf{V} \right]$$$$\mathbf{W} = \mathbf{W} \odot \left[ \mathbf{V} \cdot \mathbf{H}^\top \right]$$

Where $\odot$ denotes element-wise multiplication and the fraction performs element-wise division. The constant $\epsilon$ is assigned to a small value (e.g. 1e-7) to avoid division by zero. After each pass normalize the columns of $\mathbf{W}$ to sum to 1. Iterate for about 100 times.

Plot the columns of $\mathbf{W}$ and explain what they correspond to. Plot the rows of $\mathbf{H}$ and explain them as well. You might have to run the above procedure a couple of times since in some cases the results can come up wrong. Just to be safe, run this a dozen times and show the results that are representative of the majority of the outputs (note that each time the ordering will be different, we only care about the shapes of these quantities, not their order).

You can now try to extract each component. Take each column of $\mathbf{W}$ and compute its outer product with its corresponding row of $\mathbf{H}$. This will approximate only one component of the input spectrogram. Plot all four products and explain what they look like. Use the phase of the original input to invert these resulting spectrograms to the time domain and listen to them. What do they sound like?

In [16]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.wavfile import read
from scipy import signal
from copy import deepcopy
import math
import sklearn.preprocessing as sp
import random

def sound( x, rate=8000, label=''):
    """Display an inline notebook audio player for x, optionally with a label.

    x: 1-D array/list of samples; rate: sample rate in Hz; label: text shown
    in a borderless table cell next to the player (plain player when empty).
    """
    from IPython.display import display, Audio, HTML
    # Fixed: was `label is ''` — identity comparison against a string literal
    # is implementation-dependent and raises a SyntaxWarning on CPython >= 3.8.
    if label == '':
        display( Audio( x, rate=rate))
    else:
        display( HTML( 
        '<style> table, th, td {border: 0px; }</style> <table><tr><td>' + label + 
        '</td><td>' + Audio( x, rate=rate)._repr_html_()[3:] + '</td></tr></table>'
        ))
        
def stft(input_sound, dft_size, hop_size, zero_pad, window):
    """Short-time Fourier transform: hop through the signal, window each
    dft_size frame, append zero_pad, and take the one-sided DFT.

    Returns an (n_frames, n_bins) complex array; rows are frame spectra
    scaled by 2 * hop_size / dft_size.
    """
    n_samples = len(input_sound)
    scale = hop_size * 2 / dft_size
    tail = np.array(zero_pad)
    frames = []
    for start in range(0, n_samples, hop_size):
        stop = start + dft_size
        if stop >= n_samples:
            # Final frame(s): right-pad with zeros up to dft_size samples.
            frame = np.concatenate(
                [input_sound[start:], np.array([0] * (stop - n_samples))])
        else:
            frame = input_sound[start:stop]
        windowed = np.concatenate([np.multiply(frame, window), tail])
        frames.append(np.fft.rfft(windowed) * scale)
    return np.array(frames)

def show_graph(sub_fig, freq_data, amp_data_len, sample_rate, mode="spectrogram", title=""):
    """Draw a log-magnitude spectrogram of freq_data onto the axes sub_fig.

    freq_data is (n_frames, n_bins); the time axis spans
    amp_data_len / sample_rate seconds, the frequency axis 0..Nyquist.
    Any mode other than "spectrogram" is a no-op.
    """
    if mode != "spectrogram":
        return
    magnitudes = np.log(np.absolute(freq_data)).T
    time_axis = np.linspace(0, amp_data_len / sample_rate, len(magnitudes[0]))
    freq_axis = np.linspace(0, sample_rate / 2, len(magnitudes))
    grid_t, grid_f = np.meshgrid(time_axis, freq_axis)
    sub_fig.pcolormesh(grid_t, grid_f, np.array(magnitudes))
    sub_fig.set_title(title)
        
sr_80, data_80 = read("80s-hi.wav")

# Add low-level background noise; per the author's note at the end of Part 1
# this mitigates artifacts in the extracted components.
sound(data_80, rate=sr_80, label='80s-hi original')
data_80 = data_80.tolist()
for i in range(len(data_80)):
    data_80[i] += random.randrange(-200, 200)

# STFT parameters required by the lab: 4096-point DFT, 256-sample hop, Hann.
dft_size = 4096
hop_size = 256
# Fixed: duplicated `window = window = ...` typo; `signal.hann` is deprecated
# and removed in modern SciPy — `signal.windows.hann` is the supported name.
window = signal.windows.hann(dft_size)
zero_pad = []

spec_80 = stft(data_80, dft_size, hop_size, zero_pad, window)

fig, (fig_spec_80, fig_amp_80) = plt.subplots(nrows=2)

show_graph(fig_spec_80, spec_80, len(data_80), sr_80, "spectrogram", "80s-hi origin spectrogram")
fig_amp_80.plot(data_80)

fig.set_size_inches(6, 8)
fig.tight_layout()
plt.show()
80s-hi original
In [17]:
dozen_Hs = []
dozen_Ws = []

plt.figure(figsize=(10,30))

for run in range(12):

    # Random initialization in [10, 11) as the lab specifies.
    # Note: spec_80 is stored frames-by-bins (N x M), hence the transposes.
    H = 10 + np.random.rand(4, spec_80.shape[0])
    W = 10 + np.random.rand(spec_80.shape[1], 4)
    magnitude = np.absolute(spec_80.T)

    # Multiplicative updates; both H and W updates use the pre-update H,
    # and W's columns are renormalized to sum to 1 after every pass.
    for _ in range(100):
        V = magnitude / (W @ H + 1e-7)
        updated_H = H * (W.T @ V)
        updated_W = W * (V @ H.T)
        H = updated_H
        W = updated_W / updated_W.sum(axis=0, keepdims=True)

    dozen_Hs.append(H)
    dozen_Ws.append(W)

    plt.subplot(12, 2, 2 * run + 1)
    plt.plot(W)
    plt.ylim(0, 0.05)
    plt.tight_layout()
    plt.title("W, iteration %d" % run)
    plt.subplot(12, 2, 2 * run + 2)
    plt.plot(H.T)
    plt.tight_layout()
    plt.title("H, iteration %d" % run)

plt.show()
    
        
    
    

The W graphs indicate the spectral profile learned for each instrument.

The H graphs show the detected activations of each factor over time.

Now, select result No. 9 and extract each component.

In [19]:
def istft(stft_output, dft_size, hop_size, zero_pad, window, scale=0.2):
    """Invert an STFT produced by stft() via windowed overlap-add.

    Parameters:
        stft_output: (n_frames, n_bins) complex spectra, one frame per row.
        dft_size, hop_size, zero_pad, window: must match the analysis call.
        scale: output gain applied to every frame; default 0.2 preserves the
            original behavior. NOTE(review): 0.2 looks like an empirical
            compensation for the Hann overlap-add gain — confirm.

    Returns a 1-D float array of hop_size * n_frames samples.
    """
    n_frames = len(stft_output)
    result = np.zeros(hop_size * n_frames - hop_size + dft_size, dtype=float)
    for i in range(n_frames):
        # Drop the analysis-time zero padding, then apply the synthesis
        # window before overlap-adding into the output buffer.
        each_wave = np.fft.irfft(stft_output[i], n=dft_size + len(zero_pad))[:dft_size]
        start = i * hop_size
        result[start:start + dft_size] += each_wave * window * scale
    # Trim the synthesis tail so the output length is hop_size * n_frames.
    return result[:hop_size * n_frames]

selected_H = dozen_Hs[9]
selected_W = dozen_Ws[9]

fig, subfig = plt.subplots(nrows=4)
instruments = []
spec = np.zeros(spec_80.T.shape, dtype=float)
for comp in range(4):
    # Rank-1 reconstruction of one component: outer(W column, H row).
    spec = np.outer(selected_W[:, comp], selected_H[comp])

    show_graph(subfig[comp], spec.T, len(data_80), sr_80, "spectrogram", "instrument %d spectrogram" % comp)
    # Mask the mixture with the component's relative magnitude so the
    # original phase is kept for resynthesis.
    mask = np.divide(np.absolute(spec.T), np.absolute(spec_80) + 1e-7)
    extracted_spec = spec_80 * mask
    extracted_sound = istft(extracted_spec, dft_size, hop_size, zero_pad, window)
    sound(extracted_sound, rate=sr_80, label='instrument %d sound' % comp)

fig.set_size_inches(6, 16)
fig.tight_layout()
plt.show()
    


    


    
instrument 0 sound
instrument 1 sound
instrument 2 sound
instrument 3 sound

The four spectrograms represent the hi-hat, claps, artificial tone and bass drum respectively. The bright vertical lines indicate the activations of each instrument. I am not sure why some blue vertical lines appear, but adding noise to the original sound mitigates this problem.

Part 2: Training Dictionaries for Source Separation

In this section we will design a system that separates speech of a known speaker from a known type of noise. Use the sound files [https://drive.google.com/uc?export=download&id=1fhTSrXDDbsp06oqlrBL9p9JToC4Mws11 ] and [https://drive.google.com/uc?export=download&id=1fmtVWSLp5ZB5_pkFvr8BEXSj0bRuNe70] from the lab archive.

One of them is of speech and the other one of chimes. Take the first sentence of the speech sound and a segment which is just as long from the beginning of the chime sound and add them together. This will be a mixture that we will try to separate. The rest of the data we will use for training dictionary models. Taking the rest of the speech data run a factorization as we’ve done above with $K$ = 40. Do the same with the remaining chime sound. From these you will obtain two matrices $\mathbf{W}_s$ and $\mathbf{W}_c$. These are the dictionaries of the two sounds. If you visually inspect them you will see that they look a lot like representative spectra of these two sounds.

In order to resolve the mixture we need to use these dictionaries to explain its spectrogram and then only use each dictionary’s contribution to resynthesize a time signal. This essentially involves finding the $\mathbf{H}$ matrix while fixing the $\mathbf{W}$ matrix to be a concatenation of $\mathbf{W}_s$ and $\mathbf{W}_c$. You can do that using the iterative approach used in the previous part, but only updating $\mathbf{H}$ and not updating $\mathbf{W}$ at every iteration. If you do this on the mixture you will ultimately get a $\mathbf{H}$ that will let us know how to combine the elements of the pretrained dictionaries to approximate the input.

To extract the two sounds you need to isolate the contribution of the two dictionaries on the mixture. That will be $\mathbf{F}_s = \mathbf{W}_s \cdot \mathbf{H}_s$ and $\mathbf{F}_c = \mathbf{W}_c \cdot \mathbf{H}_c$, where $\mathbf{H}_s$ corresponds to the first 40 rows of $\mathbf{H}$ and $\mathbf{H}_c$ to its second 40 rows. $\mathbf{F}_s$ and $\mathbf{F}_c$ will correspond to the magnitude spectrograms of the two extracted sources. Just as before use the phase of the input mixture to invert these back to the time domain and listen to them. Do they sound like they are separated? Play around with the STFT parameters until you get the best sounding results.

In [141]:
dft_size = 4096
hop_size = 256
window = window = signal.hann(dft_size)
zero_pad = []

sr_c, data_c = read("chimes.wav")
sound(data_c, rate=sr_c, label='chimes original')
sr_s, data_s = read("speaker.wav")
sound(data_s, rate=sr_s, label='speaker original')

mixture = np.add(data_c[:56000], data_s[:56000])
sound(mixture, rate=sr_s, label='mix')

# train
spec_c = stft(data_c, dft_size, hop_size, zero_pad, window)
spec_s = stft(data_s, dft_size, hop_size, zero_pad, window)

Hc = np.random.rand(40, spec_c.shape[0]) + 10
Wc = np.random.rand(spec_c.shape[1], 40) + 10
Fc_abs = np.absolute(spec_c.T)
    
for j in range(0, 100):
    Vc = np.divide(Fc_abs, np.matmul(Wc, Hc) + 1e-7)
    new_Hc = np.multiply(Hc, np.matmul(Wc.T, Vc))
    new_Wc = np.multiply(Wc, np.matmul(Vc, Hc.T))
    Hc = new_Hc
    Wc = new_Wc / new_Wc.sum(axis=0,keepdims=1)
    
#
Hs = np.random.rand(40, spec_s.shape[0]) + 10
Ws = np.random.rand(spec_s.shape[1], 40) + 10
Fs_abs = np.absolute(spec_s.T)
    
for j in range(0, 100):
    Vs = np.divide(Fs_abs, np.matmul(Ws, Hs) + 1e-7)
    new_Hs = np.multiply(Hs, np.matmul(Ws.T, Vs))
    new_Ws = np.multiply(Ws, np.matmul(Vs, Hs.T))
    Hs = new_Hs
    Ws = new_Ws / new_Ws.sum(axis=0,keepdims=1)
    
# classify
fig, subfig = plt.subplots(nrows=3)

spec_m = stft(mixture, dft_size, hop_size, zero_pad, window)
show_graph(subfig[0], spec_m, len(mixture), sr_c, "spectrogram", "mixture spectrogram")

Hm = np.random.rand(80, spec_m.shape[0]) + 10
Wm = np.concatenate([Ws, Wc], axis=1)
print(Ws.shape)
print(Wc.shape)
print(Wm.shape)
Fm_abs = np.absolute(spec_m.T)

for j in range(0, 100):
    Vm = np.divide(Fm_abs, np.matmul(Wm, Hm) + 1e-7)
    new_Hm = np.multiply(Hm, np.matmul(Wm.T, Vm))
    Hm = new_Hm
    
Hm_s = Hm[:40]
Hm_c = Hm[40:]

extracted_spec_s = spec_m * np.divide(np.matmul(Ws, Hm_s).T, np.absolute(spec_m) + 1e-7)
extracted_spec_c = spec_m * np.divide(np.matmul(Wc, Hm_c).T, np.absolute(spec_m) + 1e-7)

show_graph(subfig[1], extracted_spec_s, len(mixture), sr_c, "spectrogram", "extracted speaker spectrogram")
show_graph(subfig[2], extracted_spec_c, len(mixture), sr_c, "spectrogram", "extracted chimes spectrogram")

extracted_sound_s = istft(extracted_spec_s, dft_size, hop_size, zero_pad, window)
extracted_sound_c = istft(extracted_spec_c, dft_size, hop_size, zero_pad, window)

sound(extracted_sound_s, rate=sr_s, label='extracted speaker')
sound(extracted_sound_c, rate=sr_s, label='extracted chimes')

fig.set_size_inches(6, 12)
fig.tight_layout()
plt.show()
chimes original
speaker original
mix
(2049, 40)
(2049, 40)
(2049, 80)
extracted speaker
extracted chimes
In [ ]: